/**** This do-file randomizes schools into treatment and control for the TVET program

*/

* ---- session setup ----------------------------------------------------------
set more off
set varabbrev off
capture log close
clear all

* the log path is built from the globals $logfile and $date, which must be
* defined before this do-file is run
log using "$logfile/randomization_school_$date.log", replace


* ---- pool of candidate randomization seeds ----------------------------------
* Draw 1000 integers in [0, 10^6) from a fixed master seed and store their
* distinct values (as listed by levelsof) in the global macro randomseed.
drop _all
set obs 1000
set seed 142857
generate value = int(10^6*runiform())
levelsof value
global randomseed "`r(levels)'"

*******************************************************************************
*						GRADE 11 and 12 STUDENTS
*******************************************************************************
* Load the student-level baseline data, clean it, and collapse it to one
* observation per school holding school-level means of the covariates.
use "$clean/grade1112_baseline_reachable_wgrade.dta", clear

* temporarily drop students without school name information
drop if schoolname==""

* check responses from schools that do not have grade 11 and 12 students,
* then drop them
tab schoolname if level1112==0
drop if level1112==0

* total students taking the survey, per school (all respondents / grade 12 only)
bys schoolname: egen size=total(responseid~=.)
bys schoolname: egen size12=total(responseid~=. & grade==12)

* survey rate = respondents / total1112, capped at 1.
* Stata treats missing as larger than any number, so without the !missing()
* guard a school whose total1112 is missing or zero (surveyrate == .) would be
* silently recoded to a 100% survey rate. Such schools now keep a missing rate.
g surveyrate = size/total1112 
replace surveyrate=1 if surveyrate>1 & !missing(surveyrate)

* drop schools with less than 50% survey rate
* (a missing survey rate is not < 0.5, so those schools are kept)
drop if surveyrate<0.5

* academic stream indicators
g stream_art = stream==1
g stream_com = stream==2
g stream_sci = stream==3

* wealth index: score of the first principal component of the x11a-x11i items,
* after recoding 99 to missing and 2 to 0
* NOTE(review): presumably 1 = yes, 2 = no, 99 = don't know -- confirm against the codebook
foreach i in x11a x11b x11c x11d x11e x11f x11g x11h x11i {
	g i_`i'=`i'
	recode i_`i' (99=.)
	recode i_`i' (2=0)
}
pca i_x11a-i_x11i
predict wealth_index, score

* NOTE(review): assumes b_year is the year of birth and 2021 the survey year -- confirm
g age = 2021-b_year

* compute school-level means (one row per schoolname)
collapse (mean) size total1112 size12 surveyrate private tti_ks tti_thimphu tti_tyangste dn_tti sex age stream_art stream_com stream_sci c2 c5c1-c5c7 wealth_index, by(schoolname)

* ---- stratification variables -----------------------------------------------
* geographic group: 1, 2 or 3 according to which of the three tti_* indicators
* equals 1 (a later match overrides an earlier one; no match leaves group missing)
generate group = 1 if tti_ks==1
replace group = 2 if tti_thimphu==1
replace group = 3 if tti_tyangste==1
tabulate group

* check whether geographic x private stratification is feasible
egen geopri = group(group private)
tabulate geopri

* check whether geographic x private x size stratification is feasible:
* within every geopri cell, split schools at the cell median of total1112
* (1 = at or below the median, 2 = above it)
* NOTE(review): a missing total1112 compares as larger than the median and
* therefore lands in groupsize 2 -- confirm this is intended
levelsof geopri
global geoprilist "`r(levels)'"
generate groupsize = .
foreach cell of global geoprilist {
	summarize total1112 if geopri==`cell', detail
	local cutoff = r(p50)
	display "`cutoff'"
	replace groupsize = 1 if total1112<=`cutoff' & geopri==`cell'
	replace groupsize = 2 if total1112>`cutoff'  & geopri==`cell'
}

* final strata: geographic group x private x size half
egen geoprisize = group(group private groupsize)
tabulate geoprisize

* randomization, stratified by geographic + private group + size
*
* Steps:
*   1. For every candidate seed in $randomseed, assign treatment within each
*      geoprisize stratum and compute, for each balance covariate, the t
*      statistic and the normalized difference between the two arms.
*   2. Discard a candidate if one of four specific schools is treated, or if
*      any |t| exceeds 1.64.
*   3. Among the surviving candidates keep the seed whose largest
*      |normalized difference| is smallest.
*   4. Re-run the assignment with that seed and build the full balance table
*      (stored in observations 1-19, indexed by n).
*
* Reproducibility: every "bys geoprisize" below lists (schoolname) as a
* secondary sort key. Sorting on geoprisize alone leaves the order of schools
* inside a stratum to Stata's random tie-breaking, so the same seed could give
* different schools different draws in the search loop and in the final
* re-run. schoolname is unique after the collapse, which pins the order down.
cap drop treat*

* covariates checked for balance; this order is the row order of every
* result matrix and therefore of the exported balance table
local balancevars size total1112 surveyrate private dn_tti sex age stream_art stream_com stream_sci c2 c5c1 c5c2 c5c3 c5c4 c5c5 c5c6 c5c7 wealth_index

foreach j of global randomseed {
	set seed `j'
	* double storage avoids ties that float rounding of runiform() could create
	bys geoprisize (schoolname): g double rand_num`j'=runiform()
	bys geoprisize (schoolname): egen ordering`j'=rank(rand_num`j')
	* schools ranked strictly above (2N-1)/3 within their stratum are treated
	bys geoprisize (schoolname): g treat_`j' = ordering`j'>((_N*2-1)/3)
	drop rand_num`j' ordering`j'

	* Compare means and compute normalized difference
	* (ttest group 1 is treat==0, group 2 is treat==1)
	cap mat drop R`j'
	foreach variable of local balancevars {
		qui ttest `variable', by(treat_`j')
		mat t`j' = r(t)
		scalar nor`j'=(r(mu_1) - r(mu_2))/sqrt(0.5*(r(sd_1)^2 + r(sd_2)^2))
		mat nor_diff`j' = (`=scalar(nor`j')')

		* stack one row per covariate; "\" is the row-join operator
		* ("/" would be division by a scalar and is not conformable here)
		mat R`j' = nullmat(R`j') \ (t`j', nor_diff`j')
	}

	* store the 19 x 2 result matrix in the first 19 observations
	qui svmat double R`j'
	rename R`j'1 t_`j'
	rename R`j'2 NorDiff_`j'

	* remove randomization results with
	sort schoolname
	* schools assigned as treatment but do not have grade information
	* NOTE(review): observations 5, 18, 29 and 33 are hard-coded positions in
	* the schoolname-sorted data; they must be re-checked whenever the set of
	* schools changes
	if treat_`j'[5]==1 | treat_`j'[18]==1 | treat_`j'[29]==1 | treat_`j'[33]==1 {
		drop *_`j'
		* candidate rejected: nothing left to test for this seed
		continue
	}

	* statistically significant difference in means
	* (if every t statistic is missing, r(max) is missing, which compares as
	* larger than 1.64, so the candidate is rejected as well)
	g tabs_`j'=abs(t_`j')
	qui summ tabs_`j'
	if r(max)>1.64 {
		drop *_`j'
	}
	* tabs_`j' is already gone if the candidate was rejected just above
	cap drop tabs_`j'
}


* keep the result with minimized maximum abs(normalized difference)
cap unab survivors : NorDiff_*
if _rc {
	dis as error "no candidate seed passed the assignment and balance filters"
	exit 498
}

tempname best cand
scalar `best' = .
local seed ""
foreach v of local survivors {
	qui summ `v'
	if r(N)>0 {
		* largest absolute normalized difference of this candidate
		scalar `cand' = max(abs(r(min)), abs(r(max)))
		* "<=" so that, on an exact tie, the later candidate wins
		if scalar(`cand') <= scalar(`best') {
			scalar `best' = scalar(`cand')
			* variable names are NorDiff_<seed>; the seed starts at character 9
			local seed = substr("`v'", 9, .)
		}
	}
}
if "`seed'"=="" {
	dis as error "no surviving candidate has a non-missing normalized difference"
	exit 498
}
local j `seed'
dis "chosen randomization seed: `j'"

drop treat_* t_* NorDiff_* 

* Export results with the chosen seed
* (same commands and same sort keys as in the search loop, so the assignment
* is exactly the one that passed the filters above)
set seed `j'
bys geoprisize (schoolname): g double rand_num`j'=runiform()
bys geoprisize (schoolname): egen ordering`j'=rank(rand_num`j')
bys geoprisize (schoolname): g treat_`j' = ordering`j'>((_N*2-1)/3)
drop rand_num`j' ordering`j'

* full balance table: N and mean per arm, difference, t and normalized difference
cap mat drop R`j'
foreach variable of local balancevars {
	qui ttest `variable', by(treat_`j')
	mat Cn`j' = r(N_1)
	mat Tn`j' = r(N_2)
	mat C`j' = r(mu_1)
	mat T`j' = r(mu_2)
	mat diff`j' = r(mu_1) - r(mu_2)
	mat t`j' = r(t)
	scalar nor`j'=(r(mu_1) - r(mu_2))/sqrt(0.5*(r(sd_1)^2 + r(sd_2)^2))
	mat nor_diff`j' = (`=scalar(nor`j')')

	mat R`j' = nullmat(R`j') \ (Cn`j',C`j',Tn`j', T`j',diff`j',t`j', nor_diff`j')
}

qui svmat double R`j'
rename R`j'1 CN_`j'
rename R`j'2 Cmean_`j'
rename R`j'3 TN_`j'
rename R`j'4 Tmean_`j'
rename R`j'5 Diff_`j'
rename R`j'6 t_`j'
rename R`j'7 NorDiff_`j'

* row index of the balance table. It must be created in the sort order in
* which svmat has just stored the matrix rows; an index left over from an
* earlier sort order would attach the row labels to the wrong statistics.
cap drop n
g n=_n

format Cmean_`j' Tmean_`j' Diff_`j' t_`j' NorDiff_`j' %10.3f
format CN_`j' TN_`j' %10.0f

* ---- export the balance table -----------------------------------------------
* Works on a preserved copy: only the balance-table columns and a descriptive
* row label are kept, then the table is written to CSV and the data restored.
preserve
sort n
cap drop schoolname
cap drop size-geoprisize
cap drop variable
g variable=""

* row labels, in the row order of the balance table
local lab1  "number of students taking baseline survey"
local lab2  "total number of students"
local lab3  "baseline survey completion rate"
local lab4  "school is private"
local lab5  "distance to the nearest TTI/IZC (km)"
local lab6  "share of male students"
local lab7  "age of students"
local lab8  "share of students in Art stream"
local lab9  "share of students in Commerce stream"
local lab10 "share of students in Science stream"
local lab11 "share of students taking TVET course"
local lab12 "share of students consulting parents about educareer"
local lab13 "share of students consulting siblings about educareer"
local lab14 "share of students consulting relatives about educareer"
local lab15 "share of students consulting friends about educareer"
local lab16 "share of students consulting neighbors about educareer"
local lab17 "share of students consulting teachers about educareer"
local lab18 "share of students consulting others about educareer"
local lab19 "wealth index"
forvalues row = 1/19 {
	replace variable="`lab`row''" if n==`row'
}

* keep the 19 table rows plus one extra row (n==20), which is moved to the top
* (n==0) and labelled as the seed row; the seed itself appears as the suffix
* of the exported column names
keep if n<=20
replace n=0 if n==20
sort n
order variable
replace variable="randomization seed" if n==0
cap drop n 
cap drop treat*

export delimited using "$randomization/randomization_school_TVET_balance.csv", nolabel datafmt replace
restore 

* ---- export the school assignment list --------------------------------------
* Sort BEFORE trimming the variable list: geopri is not among the kept
* variables, and with "set varabbrev off" it cannot resolve to geoprisize, so
* sorting after the keep failed silently (under capture) and the list was
* exported unsorted. capture is removed so that a missing variable stops the
* run instead of exporting an unsorted or untrimmed dataset.
sort geopri treat* schoolname
keep schoolname treat* size size12 total1112 geoprisize
export excel using "$randomization/randomization_school_TVET_list.xlsx", replace firstrow(variables)


